library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggpubr)
library(dplyr)
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-3
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:plotly':
##
## select
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
library(plotly)
library(countrycode)
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(broom)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
library(rgl)
CovidWHO <- read_csv("https://covid19.who.int/WHO-COVID-19-global-data.csv")
## Rows: 190074 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Country_code, Country, WHO_region
## dbl (4): New_cases, Cumulative_cases, New_deaths, Cumulative_deaths
## date (1): Date_reported
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
VaccineCovid <- read_csv("https://covid19.who.int/who-data/vaccination-data.csv")
## Rows: 228 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): COUNTRY, ISO3, WHO_REGION, DATA_SOURCE, VACCINES_USED
## dbl (7): TOTAL_VACCINATIONS, PERSONS_VACCINATED_1PLUS_DOSE, TOTAL_VACCINATI...
## date (2): DATE_UPDATED, FIRST_VACCINE_DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
CovidWHO_2021_03_01 <- filter(CovidWHO, Date_reported == "2021-03-01")
CovidWHO_2022_03_01 <- filter(CovidWHO, Date_reported == "2022-03-01")
One_Year_Cum <- data.frame(CovidWHO_2021_03_01$Country, CovidWHO_2021_03_01$WHO_region, CovidWHO_2022_03_01$Cumulative_cases - CovidWHO_2021_03_01$Cumulative_cases, CovidWHO_2022_03_01$Cumulative_deaths - CovidWHO_2021_03_01$Cumulative_deaths)
names(One_Year_Cum)[1] <- 'COUNTRY'
names(One_Year_Cum)[2] <- 'WHO_REGION'
names(One_Year_Cum)[3] <- 'CaseCum'
names(One_Year_Cum)[4] <- 'DeathCum'
One_Year_Cum
## COUNTRY WHO_REGION
## 1 Afghanistan EMRO
## 2 Albania EURO
## 3 Algeria AFRO
## 4 American Samoa WPRO
## 5 Andorra EURO
## 6 Angola AFRO
## 7 Anguilla AMRO
## 8 Antigua and Barbuda AMRO
## 9 Argentina AMRO
## 10 Armenia EURO
## 11 Aruba AMRO
## 12 Australia WPRO
## 13 Austria EURO
## 14 Azerbaijan EURO
## 15 Bahamas AMRO
## 16 Bahrain EMRO
## 17 Bangladesh SEARO
## 18 Barbados AMRO
## 19 Belarus EURO
## 20 Belgium EURO
## 21 Belize AMRO
## 22 Benin AFRO
## 23 Bermuda AMRO
## 24 Bhutan SEARO
## 25 Bolivia (Plurinational State of) AMRO
## 26 Bonaire AMRO
## 27 Bosnia and Herzegovina EURO
## 28 Botswana AFRO
## 29 Brazil AMRO
## 30 British Virgin Islands AMRO
## 31 Brunei Darussalam WPRO
## 32 Bulgaria EURO
## 33 Burkina Faso AFRO
## 34 Burundi AFRO
## 35 Cabo Verde AFRO
## 36 Cambodia WPRO
## 37 Cameroon AFRO
## 38 Canada AMRO
## 39 Cayman Islands AMRO
## 40 Central African Republic AFRO
## 41 Chad AFRO
## 42 Chile AMRO
## 43 China WPRO
## 44 Colombia AMRO
## 45 Comoros AFRO
## 46 Congo AFRO
## 47 Cook Islands WPRO
## 48 Costa Rica AMRO
## 49 Côte d’Ivoire AFRO
## 50 Croatia EURO
## 51 Cuba AMRO
## 52 Curaçao AMRO
## 53 Cyprus EURO
## 54 Czechia EURO
## 55 Democratic People's Republic of Korea SEARO
## 56 Democratic Republic of the Congo AFRO
## 57 Denmark EURO
## 58 Djibouti EMRO
## 59 Dominica AMRO
## 60 Dominican Republic AMRO
## 61 Ecuador AMRO
## 62 Egypt EMRO
## 63 El Salvador AMRO
## 64 Equatorial Guinea AFRO
## 65 Eritrea AFRO
## 66 Estonia EURO
## 67 Eswatini AFRO
## 68 Ethiopia AFRO
## 69 Falkland Islands (Malvinas) AMRO
## 70 Faroe Islands EURO
## 71 Fiji WPRO
## 72 Finland EURO
## 73 France EURO
## 74 French Guiana AMRO
## 75 French Polynesia WPRO
## 76 Gabon AFRO
## 77 Gambia AFRO
## 78 Georgia EURO
## 79 Germany EURO
## 80 Ghana AFRO
## 81 Gibraltar EURO
## 82 Greece EURO
## 83 Greenland EURO
## 84 Grenada AMRO
## 85 Guadeloupe AMRO
## 86 Guam WPRO
## 87 Guatemala AMRO
## 88 Guernsey EURO
## 89 Guinea AFRO
## 90 Guinea-Bissau AFRO
## 91 Guyana AMRO
## 92 Haiti AMRO
## 93 Holy See EURO
## 94 Honduras AMRO
## 95 Hungary EURO
## 96 Iceland EURO
## 97 India SEARO
## 98 Indonesia SEARO
## 99 Iran (Islamic Republic of) EMRO
## 100 Iraq EMRO
## 101 Ireland EURO
## 102 Isle of Man EURO
## 103 Israel EURO
## 104 Italy EURO
## 105 Jamaica AMRO
## 106 Japan WPRO
## 107 Jersey EURO
## 108 Jordan EMRO
## 109 Kazakhstan EURO
## 110 Kenya AFRO
## 111 Kiribati WPRO
## 112 Kosovo[1] EURO
## 113 Kuwait EMRO
## 114 Kyrgyzstan EURO
## 115 Lao People's Democratic Republic WPRO
## 116 Latvia EURO
## 117 Lebanon EMRO
## 118 Lesotho AFRO
## 119 Liberia AFRO
## 120 Libya EMRO
## 121 Liechtenstein EURO
## 122 Lithuania EURO
## 123 Luxembourg EURO
## 124 Madagascar AFRO
## 125 Malawi AFRO
## 126 Malaysia WPRO
## 127 Maldives SEARO
## 128 Mali AFRO
## 129 Malta EURO
## 130 Marshall Islands WPRO
## 131 Martinique AMRO
## 132 Mauritania AFRO
## 133 Mauritius AFRO
## 134 Mayotte AFRO
## 135 Mexico AMRO
## 136 Micronesia (Federated States of) WPRO
## 137 Monaco EURO
## 138 Mongolia WPRO
## 139 Montenegro EURO
## 140 Montserrat AMRO
## 141 Morocco EMRO
## 142 Mozambique AFRO
## 143 Myanmar SEARO
## 144 Namibia AFRO
## 145 Nauru WPRO
## 146 Nepal SEARO
## 147 Netherlands EURO
## 148 New Caledonia WPRO
## 149 New Zealand WPRO
## 150 Nicaragua AMRO
## 151 Niger AFRO
## 152 Nigeria AFRO
## 153 Niue WPRO
## 154 North Macedonia EURO
## 155 Northern Mariana Islands (Commonwealth of the) WPRO
## 156 Norway EURO
## 157 occupied Palestinian territory, including east Jerusalem EMRO
## 158 Oman EMRO
## 159 Other Other
## 160 Pakistan EMRO
## 161 Palau WPRO
## 162 Panama AMRO
## 163 Papua New Guinea WPRO
## 164 Paraguay AMRO
## 165 Peru AMRO
## 166 Philippines WPRO
## 167 Pitcairn Islands WPRO
## 168 Poland EURO
## 169 Portugal EURO
## 170 Puerto Rico AMRO
## 171 Qatar EMRO
## 172 Republic of Korea WPRO
## 173 Republic of Moldova EURO
## 174 Réunion AFRO
## 175 Romania EURO
## 176 Russian Federation EURO
## 177 Rwanda AFRO
## 178 Saba AMRO
## 179 Saint Barthélemy AMRO
## 180 Saint Helena AFRO
## 181 Saint Kitts and Nevis AMRO
## 182 Saint Lucia AMRO
## 183 Saint Martin AMRO
## 184 Saint Pierre and Miquelon AMRO
## 185 Saint Vincent and the Grenadines AMRO
## 186 Samoa WPRO
## 187 San Marino EURO
## 188 Sao Tome and Principe AFRO
## 189 Saudi Arabia EMRO
## 190 Senegal AFRO
## 191 Serbia EURO
## 192 Seychelles AFRO
## 193 Sierra Leone AFRO
## 194 Singapore WPRO
## 195 Sint Eustatius AMRO
## 196 Sint Maarten AMRO
## 197 Slovakia EURO
## 198 Slovenia EURO
## 199 Solomon Islands WPRO
## 200 Somalia EMRO
## 201 South Africa AFRO
## 202 South Sudan AFRO
## 203 Spain EURO
## 204 Sri Lanka SEARO
## 205 Sudan EMRO
## 206 Suriname AMRO
## 207 Sweden EURO
## 208 Switzerland EURO
## 209 Syrian Arab Republic EMRO
## 210 Tajikistan EURO
## 211 Thailand SEARO
## 212 The United Kingdom EURO
## 213 Timor-Leste SEARO
## 214 Togo AFRO
## 215 Tokelau WPRO
## 216 Tonga WPRO
## 217 Trinidad and Tobago AMRO
## 218 Tunisia EMRO
## 219 Turkey EURO
## 220 Turkmenistan EURO
## 221 Turks and Caicos Islands AMRO
## 222 Tuvalu WPRO
## 223 Uganda AFRO
## 224 Ukraine EURO
## 225 United Arab Emirates EMRO
## 226 United Republic of Tanzania AFRO
## 227 United States of America AMRO
## 228 United States Virgin Islands AMRO
## 229 Uruguay AMRO
## 230 Uzbekistan EURO
## 231 Vanuatu WPRO
## 232 Venezuela (Bolivarian Republic of) AMRO
## 233 Viet Nam WPRO
## 234 Wallis and Futuna WPRO
## 235 Yemen EMRO
## 236 Zambia AFRO
## 237 Zimbabwe AFRO
## CaseCum DeathCum
## 1 117926 5154
## 2 164396 1673
## 3 151844 3852
## 4 84 0
## 5 27133 41
## 6 77934 1392
## 7 2510 9
## 8 6711 121
## 9 6793291 74187
## 10 247940 5283
## 11 25880 140
## 12 2812847 4262
## 13 2243646 5653
## 14 551403 6199
## 15 24584 591
## 16 392488 1005
## 17 1397575 20629
## 18 52127 283
## 19 632430 4504
## 20 2797457 7815
## 21 44304 336
## 22 20941 93
## 23 10808 111
## 24 12270 5
## 25 644965 9813
## 26 6517 24
## 27 239064 10363
## 28 235580 2309
## 29 18250872 394913
## 30 5932 61
## 31 63356 73
## 32 844241 25390
## 33 8769 233
## 34 35917 12
## 35 40563 254
## 36 129456 3032
## 37 83526 1372
## 38 2419341 14577
## 39 18935 15
## 40 9323 50
## 41 3284 50
## 42 2235394 21781
## 43 233956 1389
## 44 3814566 79033
## 45 4464 16
## 46 15200 250
## 47 25 0
## 48 600958 5228
## 49 48729 601
## 50 811821 9543
## 51 1020083 8172
## 52 34225 239
## 53 286036 623
## 54 2345278 17735
## 55 0 0
## 56 60242 628
## 57 2553335 2212
## 58 9481 126
## 59 10966 57
## 60 334939 1268
## 61 546252 19433
## 62 301347 13386
## 63 87920 2217
## 64 9879 91
## 65 6854 96
## 66 432451 1650
## 67 52146 738
## 68 309655 5097
## 69 59 0
## 70 33722 27
## 71 63871 832
## 72 632312 1913
## 73 18416911 49302
## 74 61025 306
## 75 49264 502
## 76 32978 220
## 77 7227 215
## 78 1341401 12679
## 79 12420150 52832
## 80 76005 835
## 81 11198 8
## 82 2225730 19280
## 83 11717 18
## 84 13542 215
## 85 114560 754
## 86 29773 191
## 87 602449 10578
## 88 12682 14
## 89 20405 351
## 90 4760 119
## 91 54353 1025
## 92 17905 570
## 93 0 0
## 94 242979 6634
## 95 1356656 28993
## 96 125217 33
## 97 31818804 356866
## 98 4247862 112335
## 99 5420260 76765
## 100 1607304 11583
## 101 1080706 1895
## 102 22331 45
## 103 2859921 4479
## 104 9858659 57068
## 105 104723 2391
## 106 4573119 15746
## 107 34292 43
## 108 1240618 9134
## 109 1127945 15558
## 110 216973 3783
## 111 2914 11
## 112 156992 1515
## 113 429042 1455
## 114 114276 1492
## 115 142698 621
## 116 566934 3606
## 117 693677 5399
## 118 22216 405
## 119 5374 209
## 120 361108 4082
## 121 9543 20
## 122 708041 5180
## 123 127823 355
## 124 43828 1069
## 125 53394 1575
## 126 3141984 31619
## 127 150676 235
## 128 22005 369
## 129 48661 290
## 130 0 0
## 131 107722 849
## 132 41446 540
## 133 169177 894
## 134 19062 77
## 135 3414819 108368
## 136 0 0
## 137 7456 27
## 138 905205 2094
## 139 154363 1677
## 140 143 1
## 141 677321 7365
## 142 165697 1551
## 143 448321 16173
## 144 118405 3585
## 145 0 0
## 146 702860 8928
## 147 5273722 5996
## 148 54255 299
## 149 118138 30
## 150 8816 51
## 151 4014 135
## 152 98903 1235
## 153 0 0
## 154 194475 5901
## 155 9407 28
## 156 1192645 1039
## 157 438854 3254
## 158 240748 2674
## 159 19 0
## 160 929387 17318
## 161 3781 6
## 162 414817 2248
## 163 39986 625
## 164 482784 15192
## 165 2192401 88598
## 166 3085648 44133
## 167 0 0
## 168 3965476 67724
## 169 2458056 4746
## 170 373200 2074
## 171 193318 412
## 172 3183425 6565
## 173 316933 7020
## 174 289746 614
## 175 1931490 43064
## 176 12237719 265991
## 177 110652 1196
## 178 202 0
## 179 3099 4
## 180 0 0
## 181 5489 42
## 182 19332 323
## 183 8288 32
## 184 971 1
## 185 6755 98
## 186 32 0
## 187 10661 38
## 188 4148 43
## 189 367644 2504
## 190 51173 1088
## 191 1451716 10798
## 192 36759 152
## 193 3778 46
## 194 664488 990
## 195 438 3
## 196 7497 58
## 197 1149204 11260
## 198 702651 2967
## 199 7031 99
## 200 19056 1109
## 201 2160649 49419
## 202 9052 44
## 203 7890220 25786
## 204 563396 15768
## 205 31001 2018
## 206 69204 1146
## 207 1781971 4557
## 208 2261799 3088
## 209 38987 2048
## 210 4072 34
## 211 2886316 22893
## 212 14723954 37647
## 213 22603 128
## 214 29905 188
## 215 0 0
## 216 355 0
## 217 120062 3489
## 218 764953 19783
## 219 11386977 65876
## 220 0 0
## 221 3753 22
## 222 0 0
## 223 123011 3254
## 224 3490009 80079
## 225 488449 1080
## 226 33111 777
## 227 49824729 421468
## 228 12749 84
## 229 783149 6374
## 230 156575 1014
## 231 17 0
## 232 376056 4295
## 233 3441037 40217
## 234 444 7
## 235 9482 1500
## 236 234216 2861
## 237 200291 3932
VaccineCovid
## # A tibble: 228 × 14
## COUNTRY ISO3 WHO_REGION DATA_SOURCE DATE_UPDATED TOTAL_VACCINATI…
## <chr> <chr> <chr> <chr> <date> <dbl>
## 1 Afghanistan AFG EMRO REPORTING 2022-03-06 5597130
## 2 Albania ALB EURO REPORTING 2022-02-20 2707658
## 3 Algeria DZA AFRO REPORTING 2022-03-09 13704895
## 4 American Samoa ASM WPRO REPORTING 2022-02-16 85050
## 5 Andorra AND EURO REPORTING 2022-02-13 142420
## 6 Angola AGO AFRO REPORTING 2022-03-08 16850195
## 7 Anguilla AIA AMRO REPORTING 2022-02-25 22165
## 8 Antigua and Barbu… ATG AMRO REPORTING 2022-02-25 124726
## 9 Argentina ARG AMRO REPORTING 2022-02-25 93008081
## 10 Armenia ARM EURO REPORTING 2022-02-13 1971565
## # … with 218 more rows, and 8 more variables:
## # PERSONS_VACCINATED_1PLUS_DOSE <dbl>, TOTAL_VACCINATIONS_PER100 <dbl>,
## # PERSONS_VACCINATED_1PLUS_DOSE_PER100 <dbl>, PERSONS_FULLY_VACCINATED <dbl>,
## # PERSONS_FULLY_VACCINATED_PER100 <dbl>, VACCINES_USED <chr>,
## # FIRST_VACCINE_DATE <date>, NUMBER_VACCINES_TYPES_USED <dbl>
NewCovid <- as.data.frame(inner_join(One_Year_Cum, VaccineCovid))
## Joining, by = c("COUNTRY", "WHO_REGION")
library(readxl)
url <- "https://hdr.undp.org/sites/default/files/2020_statistical_annex_table_3.xlsx"
destfile <- "X2020_statistical_annex_table_3.xlsx"
curl::curl_download(url, destfile)
X2020_statistical_annex_table_3 <- read_excel(destfile,
col_types = c("skip", "text", "text",
"numeric", "text", "numeric", "text",
"numeric", "text", "text", "text",
"numeric", "text", "text", "text",
"numeric", "text", "text", "text",
"numeric", "text", "text", "text",
"numeric", "text", "text", "text",
"text", "text", "text", "text", "text"),
skip = 2)
## New names:
## * `` -> ...1
## * `` -> ...3
## * `` -> ...5
## * `` -> ...6
## * `` -> ...7
## * ...
X2020_statistical_annex_table_3 <- as.data.frame(X2020_statistical_annex_table_3[-1:-3,-5:-31])
X2020_statistical_annex_table_3 <- as.data.frame(X2020_statistical_annex_table_3[,-3])
X2020_statistical_annex_table_3 <- as.data.frame(X2020_statistical_annex_table_3[-193:-261,])
X2020_statistical_annex_table_3[,1] <- countrycode(X2020_statistical_annex_table_3[,1], origin = 'country.name', destination = 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: High human development, Low human development, Medium human development
NewCovid[,1] <- countrycode(NewCovid[,1], origin = 'country.name', destination = 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: Bonaire, Saba, Sint Eustatius
colnames(X2020_statistical_annex_table_3) <- c("COUNTRY","HDI","IneqHDI")
WikiPop <- read_html("https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)")
PopTable = html_node(WikiPop, ".wikitable")
PopTable = html_table(PopTable, fill = TRUE)
PopTable <- as.data.frame(PopTable)
PopTable[,1] <- countrycode(PopTable[,1], origin = 'country.name', destination = 'iso3c')
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some values were not matched unambiguously: American Samoa (United States), Anguilla (United Kingdom), Aruba (Netherlands), Bermuda (United Kingdom), British Virgin Islands (United Kingdom), Cayman Islands (United Kingdom), Cook Islands (New Zealand), Curaçao (Netherlands), Falkland Islands (United Kingdom), Faroe Islands (Denmark), French Guiana (France), French Polynesia (France), Gibraltar (United Kingdom), Greenland (Denmark), Guadeloupe (France)[v], Guam (United States), Isle of Man (United Kingdom), Mayotte (France), Micronesia, Montserrat (United Kingdom), New Caledonia (France), Niue (New Zealand), Northern Mariana Islands (United States), Puerto Rico (United States), Réunion (France), Saint Helena (United Kingdom)[y], Saint Pierre and Miquelon (France), Sint Maarten (Netherlands), Tokelau (New Zealand), Turks and Caicos Islands (United Kingdom), U.S. Virgin Islands (United States), Wallis and Futuna (France), World
## Warning in countrycode_convert(sourcevar = sourcevar, origin = origin, destination = dest, : Some strings were matched more than once, and therefore set to <NA> in the result: American Samoa (United States),ASM,USA; Anguilla (United Kingdom),AIA,GBR; Aruba (Netherlands),ABW,NLD; Bermuda (United Kingdom),BMU,GBR; British Virgin Islands (United Kingdom),VGB,GBR; Cayman Islands (United Kingdom),CYM,GBR; Cook Islands (New Zealand),COK,NZL; Curaçao (Netherlands),CUW,NLD; Falkland Islands (United Kingdom),FLK,GBR; Faroe Islands (Denmark),DNK,FRO; French Guiana (France),FRA,GUF; French Polynesia (France),FRA,PYF; Gibraltar (United Kingdom),GIB,GBR; Greenland (Denmark),DNK,GRL; Guadeloupe (France)[v],FRA,GLP; Guam (United States),GUM,USA; Isle of Man (United Kingdom),IMN,GBR; Mayotte (France),FRA,MYT; Montserrat (United Kingdom),MSR,GBR; New Caledonia (France),FRA,NCL; Niue (New Zealand),NZL,NIU; Northern Mariana Islands (United States),MNP,USA; Puerto Rico (United States),PRI,USA; Réunion (France),FRA,REU; Saint Helena (United Kingdom)[y],SHN,GBR; Saint Pierre and Miquelon (France),FRA,SPM; Sint Maarten (Netherlands),NLD,SXM; Tokelau (New Zealand),NZL,TKL; Turks and Caicos Islands (United Kingdom),TCA,GBR; U.S. Virgin Islands (United States),VIR,USA; Wallis and Futuna (France),FRA,WLF
names(PopTable)[1] <- 'COUNTRY'
NewCovid <-inner_join(X2020_statistical_annex_table_3, NewCovid)
## Joining, by = "COUNTRY"
NewCovid <- na.omit(inner_join(PopTable, NewCovid))
## Joining, by = "COUNTRY"
NewCovid[,1] <- countrycode(NewCovid[,1], origin = 'iso3c', destination = 'country.name')
NewCovid$HDI <- as.numeric(NewCovid$HDI)
NewCovid$IneqHDI <- as.numeric(NewCovid$IneqHDI)
## Warning: NAs introduced by coercion
NewCovid <- NewCovid[,-2:-4]
NewCovid <- NewCovid[,-3]
names(NewCovid)[2] <- 'Population'
NewCovid$Population <- as.numeric(gsub(",","",NewCovid$Population))
The data set(s) are from the World Health Organization’s Database,Wikipedia, and the United Nations Development Programme.
CovidWHO is the covid global data, and has the Date Reported, Country Code, Name of Country, WHO Region, New cases, Cumulative cases,New deaths, and Cumulative deaths.
One_Year_Cum has Country Name, WHO Region, Cumulative Cases between March 1st 2021 and 2022, and Cumulative Deaths between March 1st 2021 and 2022
VaccineCovid has Country, ISO3 code for each country, WHO Region, Source of Data, Date Updated, Total vaccinations, Persons vaccinated with one or more dose, Persons Total vaccinated out of 100, Persons vaccinated with one or more dose out of 100, Persons fully vaccinated, Persons fully vaccinated out of 100, Vaccine types used, First vaccination Date, and types of vaccination used.
X2020_statistical_annex_table_3 has Country name, HDI and HDI adjusted for Inequality for 2020.
PopTable has Country (IS03), UN Continental Region, UN Statistical Region, Population for each country as of July 1st 2018, Population for each country as of July 1st 2019, and Percent change of population between those two years.
NormalizedCases <- NewCovid$CaseCum/NewCovid$Population*100000
NormalizedDeaths <- NewCovid$DeathCum/NewCovid$Population*100000
CaseMortalityRate <- (NormalizedDeaths/NormalizedCases)*100
NewCovid2<- na.omit(data.frame(NewCovid$COUNTRY, NewCovid$WHO_REGION, NormalizedCases, NormalizedDeaths, CaseMortalityRate, NewCovid$HDI,NewCovid$PERSONS_FULLY_VACCINATED_PER100, NewCovid$Population))
names(NewCovid2)[1] <- 'COUNTRY'
names(NewCovid2)[2] <- 'WHO_REGION'
names(NewCovid2)[3] <- 'NormCase'
names(NewCovid2)[4] <- 'NormDeath'
names(NewCovid2)[5] <- 'CaseMortalityRate'
names(NewCovid2)[6] <- 'HDI'
names(NewCovid2)[7] <- 'FullyVac100'
names(NewCovid2)[8] <- 'Pop'
View(NewCovid2)
To create an organized dataset for the purpose of modelling, I coerced the datasets I described into NewCovid2, with the variables Country, WHO Region, Normalized Cases (Cumulative Cases per 100000 people) between March 1st 2021 and March 1st 2022, Normalized Deaths (Cumulative Deaths per 100000 people) between March 1st 2021 and March 1st 2022,
The Corona virus Pandemic has been going on since March 2020, and has led to numerous lock downs worldwide and safety precautions being put in place for the last two years, and has eased due to Covid-19 vaccinations being rolled out,distancing measures, and mask mandates.
The vaccine roll out in most countries was a year after the pandemic has started, namely around Early March. This report will attempt to give answers to questions such as: how much variance does the HDI and Persons fully vaccinated (out of 100) explain for case mortality? What model captures this relationship best, and is there a difference of case mortality between each WHO Region?
EXPLORATION OF DATASET
To explore the dataset, I will generate summary statistics for the Variables in NewCovid2 globally as below:
summary(NewCovid2)
## COUNTRY WHO_REGION NormCase NormDeath
## Length:178 Length:178 Min. : 6 Min. : 0.000
## Class :character Class :character 1st Qu.: 591 1st Qu.: 8.733
## Mode :character Mode :character Median : 5408 Median : 38.888
## Mean : 56697 Mean : 188.448
## 3rd Qu.: 12234 3rd Qu.: 93.262
## Max. :8547567 Max. :21854.881
## CaseMortalityRate HDI FullyVac100 Pop
## Min. : 0.0000 Min. :0.3940 Min. : 0.08 Min. :1.801e+04
## 1st Qu.: 0.4374 1st Qu.:0.6012 1st Qu.:22.67 1st Qu.:2.137e+06
## Median : 1.0617 Median :0.7410 Median :50.36 Median :9.138e+06
## Mean : 1.3786 Mean :0.7229 Mean :47.54 Mean :4.135e+07
## 3rd Qu.: 1.8180 3rd Qu.:0.8433 3rd Qu.:71.01 3rd Qu.:2.908e+07
## Max. :15.8194 Max. :0.9570 Max. :97.25 Max. :1.434e+09
Overall globally, there are 178 different countries in my Data Set.
For the variable Normalized Cases, the minimum was 6, 1st Quartile was 590 cases, Mean was 56697 cases, 3rd Quartile was 12234 and Maximum was 8547567 (all out of 100000 persons).
For the variable Normalized Deaths, the minimum was 0, 1st Quartile was 8.733 deaths, Mean was 188.448 deaths, 3rd Quartile was 93.262 and Maximum was 21854.881 (all out of 100000 persons).
For Case Mortality Rate, the minimum was 0%, 1st Quartile was 0.4374%, Mean was 1.3786%, 3rd Quartile was 1.8180%, and maximum was 15.8194%.
For HDI, the minimum was 0.3940, the 1st Quartile was 0.6012, the Mean was 0.7229, 3rd Quartile was 0.8433, and maximum was 0.9570.
For Full Vaccinations, the minimum was .08, 1st Quartile was 22.67, Mean was 47.54, 3rd Quartile was 71.01, and maximum was 97.25 (all out of 100 persons).
MODEL EXPLORATION: CASE MORTALITY RATE versus HDI and PERSONS FULLY VACCINATED (out of 100)
To begin, I will fit a linear model of case mortality rate against HDI and persons fully vaccinated (out of 100). The assumptions for this model that must be satisfied are normality, homoscedasticity (if there is equal variance in the standardized residuals), if the errors are independent (no Autocorrelation), and if my independent variables (or predictors) have significant multicollinearity.
check_outlier <- function(v, coef=1.5){
quantiles <- quantile(v,probs=c(0.25,0.75))
IQR <- quantiles[2]-quantiles[1]
res <- v < (quantiles[1]-coef*IQR)|v > (quantiles[2]+coef*IQR)
return(res)
}
LModel <- lm(formula = NewCovid2$CaseMortalityRate ~ NewCovid2$HDI*NewCovid2$FullyVac100)
summary(LModel)
##
## Call:
## lm(formula = NewCovid2$CaseMortalityRate ~ NewCovid2$HDI * NewCovid2$FullyVac100)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5243 -0.6380 -0.2213 0.2587 13.3146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.96236 1.01886 2.908 0.00412 **
## NewCovid2$HDI -0.93183 1.72283 -0.541 0.58929
## NewCovid2$FullyVac100 -0.00937 0.02192 -0.427 0.66959
## NewCovid2$HDI:NewCovid2$FullyVac100 -0.01237 0.02967 -0.417 0.67714
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 174 degrees of freedom
## Multiple R-squared: 0.1887, Adjusted R-squared: 0.1748
## F-statistic: 13.49 on 3 and 174 DF, p-value: 5.876e-08
ggplot() + aes(x = "", y = NewCovid2$CaseMortalityRate) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
ggplot() + aes(x = "", y = NewCovid2$HDI) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
ggplot() + aes(x = "", y = NewCovid2$FullyVac100) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
shapiro.test(studres(LModel))
##
## Shapiro-Wilk normality test
##
## data: studres(LModel)
## W = 0.48376, p-value < 2.2e-16
Looking at the box plot of Case Mortality Rate, there seems to be outliers in the data, which may communicate that the data will not satisfy Normality. This is supported up by the Shapiro Wilk Test, which has a p value of \[p = 1.358 \times 10^{-10} < .05\]
\[H_0: \text{Resiudals Normally Distributed } H_a: \text{Residuals not Normally Distributed}\] Hence, we reject the Null Hypothesis, that the residuals are normally distributed (Normality) in favor of the Alternative Hypothesis, that the residuals are not normally distributed. To fix this a boxcox transformation would probably be best, as well as checking if there still are outliers and potentially removing them as necessary. These two measures will likely allow the normality assumption to be satisfied, and allow us to continue with model diagnostics.
NewCovid2[NewCovid2 == 0] <- NA
NewCovid2 <- na.omit((NewCovid2)) #remove variables with 0 as to allow for boxcox
bc <- boxcox(NewCovid2$CaseMortalityRate ~ NewCovid2$HDI*NewCovid2$FullyVac100)
(lambda <- bc$x[which.max(bc$y)])
## [1] 0.1414141
new_model <- lm(((NewCovid2$CaseMortalityRate^lambda-1)/lambda) ~ NewCovid2$HDI*NewCovid2$FullyVac100)
data <- ((NewCovid2$CaseMortalityRate^lambda-1)/lambda)
outlier <- check_outlier(data)
label <- ifelse(outlier,NewCovid2$COUNTRY,"")
ggplot() + aes(x = "", y = ((NewCovid2$CaseMortalityRate^lambda-1)/lambda)) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()+geom_text(aes(label=label),hjust=-1)
plot(new_model, 1, id.n = 5)
plot(new_model, 2, id.n = 5)
plot(new_model, 3, id.n = 5)
plot(new_model, 4, id.n = 5)
plot(new_model, 5, id.n = 5)
CorMat <- data.frame(((NewCovid2$CaseMortalityRate^lambda-1)/lambda),NewCovid2$HDI,NewCovid2$FullyVac100)
CorMat <- cor(CorMat)
CorMat
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda.
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. 1.0000000
## NewCovid2.HDI -0.5085029
## NewCovid2.FullyVac100 -0.5552074
## NewCovid2.HDI
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. -0.5085029
## NewCovid2.HDI 1.0000000
## NewCovid2.FullyVac100 0.7973061
## NewCovid2.FullyVac100
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. -0.5552074
## NewCovid2.HDI 0.7973061
## NewCovid2.FullyVac100 1.0000000
findCorrelation(abs(CorMat), cutoff = 0.8, names = TRUE)
## character(0)
summary(new_model)
##
## Call:
## lm(formula = ((NewCovid2$CaseMortalityRate^lambda - 1)/lambda) ~
## NewCovid2$HDI * NewCovid2$FullyVac100)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.00184 -0.42056 -0.00359 0.47031 2.97403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.80208 0.56901 -1.410 0.160472
## NewCovid2$HDI 2.55112 0.96216 2.651 0.008768 **
## NewCovid2$FullyVac100 0.04703 0.01234 3.812 0.000192 ***
## NewCovid2$HDI:NewCovid2$FullyVac100 -0.08858 0.01664 -5.322 3.19e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7871 on 171 degrees of freedom
## Multiple R-squared: 0.4168, Adjusted R-squared: 0.4065
## F-statistic: 40.73 on 3 and 171 DF, p-value: < 2.2e-16
shapiro.test(studres(new_model))
##
## Shapiro-Wilk normality test
##
## data: studres(new_model)
## W = 0.97138, p-value = 0.001136
model.diag.metrics <- augment(new_model)
model.diag.metrics %>%
top_n(5, wt = .cooksd)
## # A tibble: 5 × 9
## `((NewCovid2$C…` `NewCovid2$HDI` `NewCovid2$Ful…` .fitted .resid .hat .sigma
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.14 0.51 5.32 0.509 1.64 0.0275 0.779
## 2 3.38 0.47 1.29 0.404 2.97 0.0429 0.754
## 3 -2.70 0.433 0.08 0.303 -3.00 0.0588 0.753
## 4 -2.57 0.654 74.5 0.0545 -2.63 0.0479 0.762
## 5 -2.84 0.949 80.9 -1.38 -1.47 0.0343 0.781
## # … with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
model.diag.metrics %>%
top_n(5, wt = .std.resid)
## # A tibble: 5 × 9
## `((NewCovid2$CaseMor…` `NewCovid2$HDI` `NewCovid2$Ful…` .fitted .resid .hat
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.25 0.779 61.1 -0.157 1.41 0.00898
## 2 2.14 0.51 5.32 0.509 1.64 0.0275
## 3 1.54 0.777 73.1 -0.414 1.96 0.0143
## 4 3.38 0.47 1.29 0.404 2.97 0.0429
## 5 1.39 0.759 76.7 -0.415 1.81 0.0200
## # … with 3 more variables: .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
It seems that Normality is not satisfied yet, so the next course of action would be to remove outliers based off their cook’s distances. Based off the cooks distance plot, the data points that have the highest cook distances are points 45, 77, and 151 which correspond to Yemen (with a case mortality rate of \(\approx 15.9\%\)), Burundi (with a case mortality rate of \(\approx 0.0334\%\)) and Guyana (with a case mortality rate of \(\approx 1.885\%\)) With regard to Yemen, it has the highest case mortality rate globally, and is an extreme outlier, not following the overall trend. With regard to Burundi, it has a very low case mortality rate compared to it’s HDI (0.433) and persons fully vaccinated (0.080 out of 100), and Guyana has a fairly low case mortality rate compared to it’s HDI (0.682) and persons fully vaccinated (44.467 out of 100). Hence, these data points do not follow the general trend associated with the data, and there will likely be a marked improvement in the normality of the data once these points are removed.
NewCovid2 <- NewCovid2[-c(45,77,151), ]
bc <- boxcox(NewCovid2$CaseMortalityRate ~ NewCovid2$HDI*NewCovid2$FullyVac100)
(lambda <- bc$x[which.max(bc$y)])
## [1] 0.1414141
new_model <- lm(((NewCovid2$CaseMortalityRate^lambda-1)/lambda) ~ NewCovid2$HDI*NewCovid2$FullyVac100)
confint(new_model)
## 2.5 % 97.5 %
## (Intercept) -1.72185026 0.33293762
## NewCovid2$HDI 0.56501098 3.99811491
## NewCovid2$FullyVac100 0.02977906 0.07448294
## NewCovid2$HDI:NewCovid2$FullyVac100 -0.12220154 -0.06234453
CorMat <- data.frame(((NewCovid2$CaseMortalityRate^lambda-1)/lambda),NewCovid2$HDI,NewCovid2$FullyVac100)
CorMat <- cor(CorMat)
CorMat
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda.
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. 1.0000000
## NewCovid2.HDI -0.5621095
## NewCovid2.FullyVac100 -0.5876201
## NewCovid2.HDI
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. -0.5621095
## NewCovid2.HDI 1.0000000
## NewCovid2.FullyVac100 0.7954318
## NewCovid2.FullyVac100
## X..NewCovid2.CaseMortalityRate.lambda...1..lambda. -0.5876201
## NewCovid2.HDI 0.7954318
## NewCovid2.FullyVac100 1.0000000
findCorrelation(abs(CorMat), cutoff = 0.9, names = TRUE)
## character(0)
plot(new_model, 1, id.n = 5)
plot(new_model, 2, id.n = 5)
plot(new_model, 3, id.n = 5)
plot(new_model, 4, id.n = 5)
plot(new_model, 5, id.n = 5)
hist(rstandard(new_model))
summary(new_model)
##
## Call:
## lm(formula = ((NewCovid2$CaseMortalityRate^lambda - 1)/lambda) ~
## NewCovid2$HDI * NewCovid2$FullyVac100)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.55965 -0.41834 -0.02625 0.44482 1.89653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.69446 0.52041 -1.334 0.18387
## NewCovid2$HDI 2.28156 0.86950 2.624 0.00949 **
## NewCovid2$FullyVac100 0.05213 0.01132 4.604 8.13e-06 ***
## NewCovid2$HDI:NewCovid2$FullyVac100 -0.09227 0.01516 -6.087 7.58e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6936 on 168 degrees of freedom
## Multiple R-squared: 0.4836, Adjusted R-squared: 0.4744
## F-statistic: 52.44 on 3 and 168 DF, p-value: < 2.2e-16
shapiro.test(studres(new_model))
##
## Shapiro-Wilk normality test
##
## data: studres(new_model)
## W = 0.99487, p-value = 0.818
bptest(new_model)
##
## studentized Breusch-Pagan test
##
## data: new_model
## BP = 4.7717, df = 3, p-value = 0.1893
durbinWatsonTest(new_model)
## lag Autocorrelation D-W Statistic p-value
## 1 0.06751893 1.86198 0.3
## Alternative hypothesis: rho != 0
model.diag.metrics <- augment(new_model)
model.diag.metrics %>%
top_n(5, wt = .cooksd)
## # A tibble: 5 × 9
## `((NewCovid2$C…` `NewCovid2$HDI` `NewCovid2$Ful…` .fitted .resid .hat .sigma
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2.14 0.51 5.32 0.496 1.65 0.0305 0.683
## 2 1.32 0.394 6.37 0.305 1.02 0.0688 0.691
## 3 -0.686 0.433 3.86 0.340 -1.03 0.0565 0.691
## 4 -2.86 0.931 82.1 -1.34 -1.52 0.0310 0.685
## 5 -2.84 0.949 80.9 -1.40 -1.45 0.0347 0.686
## # … with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
model.diag.metrics %>%
top_n(5, wt = .std.resid)
## # A tibble: 5 × 9
## `((NewCovid2$CaseMor…` `NewCovid2$HDI` `NewCovid2$Ful…` .fitted .resid .hat
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.25 0.779 61.1 -0.123 1.38 0.00926
## 2 2.14 0.51 5.32 0.496 1.65 0.0305
## 3 1.54 0.777 73.1 -0.353 1.90 0.0148
## 4 1.11 0.782 66.2 -0.236 1.34 0.0104
## 5 1.39 0.759 76.7 -0.336 1.73 0.0209
## # … with 3 more variables: .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>
To reiterate, the null and alternative hypothesis for normality are as follows, \[H_0: \text{Resiudals Normally Distributed } H_a: \text{Residuals not Normally Distributed}\]
Since the Shapiro-Wilk normality test has has a p-value of \(p = 0.818 > .05\), we can accept the null hypothesis, and normality is now satisfied. This is supported by the Q-Q plot, as the residuals approximately follow a normal distribution. With Normality satisfied, the other assumptions can now be checked.
Now, we must check homoscedasticity. The null and alternative hypothesis for homoscedasticity are as follows,\[H_0: \text{constant variances in the residuals errors (homoscedasticity) } H_a: \text{non-constant variances in the residuals errors}\]
Since the studentized Breusch-Pagan test has a p-value of \(p = 0.1893 > .05\), we accept the null hypothesis, and homoscedasticity is satisfied. This is supported by the fact that the scale-location plot has a horizontal line with equally spread points. This is supported because in our Scale-Location plot, we have a horizontal line with equally spread points.
Now, we must check for independence, or autocorrelation. The null and alternative hypothesis for autocorrelation are as follows,\[H_0: \text{errors are independent (no autocorrelation) } H_a: \text{errors are not independent (autocorrelation)}\]
Since the Durbin-Watson test has a p-value of \(p = 0.372 > .05\), we accept the null hypothesis, and independence is satisfied
Finally, I believe there is not significant correlation, or Multicollinearity in our model. Although the persons fully vaccinated (out of 100) and HDI have a correlation of 0.7954318, these two variables give different information with regard to the dependent variable, case mortality rate. For example, although how developed a nation is (HDI) is correlated with how many people are fully vaccinated (out of 100), due to people in more developed nations having better access to vaccines to begin with, the HDI also accounts for other factors such as life expectancy, years of education, and per capita income, which are related to mortality rate as countries without a high life expectancy will probably be hit harder by covid (individuals more malnourished, etc). Countries with high per capita income probably also have better access to medical devices, medicines, and overall a more robust healthcare system, all which decrease case mortality rate, something not accounted for in persons fully vaccinated (out of 100) Hence, I believe these two variables communicate some degree of differing information. Adding to this, none of the values in my correlation matrix have a correlation over .9, so they carry differing information. Both of my predictor variables have moderate correlation with the output (case mortality rate), so they should both be kept in the model.
Now, I will check if my coefficients are significant (non-zero). My \(\alpha = \frac{.05}{4} = .0125\), as I need to use the Bonferroni correction and divide by the number of hypothesis tests, and my confidence interval is 95%
Since for \(\beta_0\) \(p = 0.1838 > .0125 = \alpha\) and my null and alternative hypothesis are as follows \[H_0: \beta_0 \neq 0, H_a: \beta_0 = 0\], I reject my null hypothesis in favor of the alternative, \(\beta_0 = 0\)
Since for \(\beta_1\) \(p = 0.00949 < .0125 = \alpha\) and my null and alternative hypothesis are as follows \[H_0: \beta_1 \neq 0, H_a: \beta_1 = 0\], I accept my null hypothesis, \(\beta_1 \neq 0\)
Since for \(\beta_2\) \(p = 8.13 \times 10 ^{-6} < .0125 = \alpha\) and my null and alternative hypothesis are as follows \[H_0: \beta_2 \neq 0, H_a: \beta_2 = 0\], I accept my null hypothesis, \(\beta_2 \neq 0\)
Since for \(\beta_{1,2}\) \(p = 7.58 \times 10 ^{-9} < .0125 = \alpha\) and my null and alternative hypothesis are as follows \[H_0: \beta_{1,2} \neq 0, H_a: \beta_{1,2} = 0\], I accept my null hypothesis, \(\beta_{1,2} \neq 0\)
This is supported by their 95% confidence intervals \[\beta_0 \in [-1.72185026, 0.33293762], \beta_1 \in [0.56501098, 3.99811491], \beta_2 \in [0.02977906,0.07448294], \beta_{1,2} \in [-0.12220154, -0.06234453]\] as \(\beta_0\)’s contains 0 while the rest do not.
This model also has an overall p value of \(p = 2.2 \times 10^{-16}\), so there is a significant relationship described by the model Finally, this model has \(R^2 = 0.4836\), meaning that this model explains roughly half the variance in the dependent variable.
The model is as follows
\[\frac{\hat{Y_i}^{\lambda}-1}{\lambda} = \beta_1 X_{1_i} + \beta_2 X_{2_i} -\beta_{1,2}X_{1_i} X_{2_i} \] \[\frac{\hat{Y_i}^{0.1414141}-1}{0.1414141} = 2.28156 X_{1_i} + 0.05213 X_{2_i} -0.09227X_{1_i} X_{2_i} \] \[{\hat{Y_i}^{0.1414141}} = 1+ 0.3226448 X_{1_i} + 0.0073719 X_{2_i} −0.0130483X_{1_i} X_{2_i} \]
\[{\hat{Y_i}^{0.1414141}} = 1+ 0.3226448 X_{1_i} + 0.0073719 X_{2_i} −0.0130483X_{1_i} X_{2_i} \] \[\hat{Y_i} = (1+ 0.3226448 X_{1_i} + 0.0073719 X_{2_i} −0.0130483X_{1_i} X_{2_i})^{7.07143064235}\]
with \(\hat{Y}\) being case mortality rate (as %), \(X_1\) being \(HDI\) and \(X_2\) being persons fully vaccinated (out of 100).
Below is a dot plot of the data,
CovidCumulative <- NewCovid2 %>% arrange(desc(CaseMortalityRate))%>%
mutate(COUNTRY= factor(COUNTRY)) %>%
mutate(text = paste("Country: ", COUNTRY, "\nPersons Fully Vaccinated (per 100 population): ", FullyVac100, "\nCumulative Cases (per 100000 pop): ", NormCase, "\nNew Deaths in last 7 Days (per 100000 pop): ", NormDeath, "\nCase Mortality Rate: ", CaseMortalityRate ,"\nHDI:", HDI,"\nPopulation:", `Pop`, sep="")) %>%
ggplot(aes(x=HDI, y = ((NewCovid2$CaseMortalityRate^lambda-1)/lambda), size = FullyVac100, color = WHO_REGION, text = text)) +
geom_point(alpha=0.5) +
scale_size(range = c(.1, 7), name="Persons Fully Vaccinated (per 100 population)")+
theme(legend.position="right") +
ylab("boxcox(Case Mortality Rate (as %))") +
xlab("HDI") +
scale_colour_viridis_d(option = "turbo")
CovidCumulativeInteractive <- ggplotly(CovidCumulative, tooltip = "text")
CovidCumulativeInteractive
Overall, based off this model we can conclude that around half of the variance in the predictor variable is based on how many people are fully vaccinated (out of 100), and the HDI of the country being looked at. Hence, this confirms that Covid in the past year or so has hit underdeveloped nations, and nations without access to vaccinations the hardest. This is important because guaranteeing a better living standard, and sharing vaccines with impoverished nations can significantly decrease their case mortality rate, and save lives. Next, I will compare the case mortality rate of each World Health Organization Region, which may support this idea.
MODEL EXPLORATION: ANOVA, CASE MORTALITY RATE, AND PERSONS FULLY VACCINATED (OUT OF 100)
boxcoxCaseMortalityRate <- ((NewCovid2$CaseMortalityRate^lambda-1)/lambda)
WHO_REGION <- as.factor(NewCovid2$WHO_REGION)
data <- data.frame(boxcoxCaseMortalityRate,WHO_REGION)
new_model_AOV <- aov(boxcoxCaseMortalityRate~WHO_REGION)
p <- ggplot(data, aes(x=WHO_REGION, y=boxcoxCaseMortalityRate, fill=WHO_REGION)) +
geom_boxplot() + labs(title="Plot of length per dose",x="Dose (mg)", y = "Length")
p
summary(new_model_AOV)
## Df Sum Sq Mean Sq F value Pr(>F)
## WHO_REGION 5 34.69 6.937 9.454 6e-08 ***
## Residuals 166 121.82 0.734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(new_model_AOV)
##
## studentized Breusch-Pagan test
##
## data: new_model_AOV
## BP = 16.788, df = 5, p-value = 0.00492
shapiro.test(studres(new_model_AOV))
##
## Shapiro-Wilk normality test
##
## data: studres(new_model_AOV)
## W = 0.99443, p-value = 0.7646
plot(new_model_AOV,1)
plot(new_model_AOV,2)
plot(new_model_AOV,3)
plot(new_model_AOV,4)
plot(new_model_AOV,5)
plot(new_model_AOV,6)
hist(boxcoxCaseMortalityRate)
My ANOVA is a model of \[\frac{\hat{y_{ij}}^{\lambda}-1}{\lambda} = \mu_{j}+ e_{ij}\]
Where \(y_{ij}\) is the real valued response for the ith subject of the jth factor level, \(\mu_j\) is the population mean of the jth factor level, and \(e_{ij} \sim N(0,\sigma^2)\) An Anova model allows us to determine if there is a difference in the means of different factors, in this case we are interested in seeing if there is a difference in the mean case mortality rate by WHO Region.
I am using the same box-cox transformation I used in my first model, as it seems to normalize the predictor variable, as shown by the histogram above.
For this model, we need to check the assumptions of homoscedasticity (homogeneity of variance), normality, and independence.
For my ANOVA Model, my Breusch-Pagan test has a p value of \(p = 0.00492 < .05 = \alpha\) so it does not satisfy homoscedasticity (homogeneity of variance). However the p value of the shapiro test is \(p = 0.7646 > .05= \alpha\) so the model is normal (satisfies normality). Hence, I can use a Welches Anova which assumes Normality but not homoscedasticity. I will also assume independence of observations, as one regions case mortality rate should not affect anthers.
new_model_welch <- oneway.test(boxcoxCaseMortalityRate~WHO_REGION)
new_model_welch
##
## One-way analysis of means (not assuming equal variances)
##
## data: boxcoxCaseMortalityRate and WHO_REGION
## F = 8.6415, num df = 5.000, denom df = 45.427, p-value = 8.177e-06
Because the p-value of my Welch Anova is \(p = 8.177\times 10^{-6} < .05 = \alpha\), I can assume that there is a difference in means between groups. To figure out what means differ, I will use a Games-Howell Test.
games_howell_test(data, boxcoxCaseMortalityRate~WHO_REGION ,conf.level = 0.95, detailed = FALSE)
## # A tibble: 15 × 8
## .y. group1 group2 estimate conf.low conf.high p.adj p.adj.signif
## * <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 boxcoxCaseMor… AFRO AMRO 0.0128 -0.430 0.456 1 e+0 ns
## 2 boxcoxCaseMor… AFRO EMRO -0.146 -0.972 0.680 9.93e-1 ns
## 3 boxcoxCaseMor… AFRO EURO -0.900 -1.39 -0.411 1.02e-5 ****
## 4 boxcoxCaseMor… AFRO SEARO -0.130 -1.28 1.03 9.98e-1 ns
## 5 boxcoxCaseMor… AFRO WPRO -1.07 -1.88 -0.256 6 e-3 **
## 6 boxcoxCaseMor… AMRO EMRO -0.159 -1.00 0.687 9.92e-1 ns
## 7 boxcoxCaseMor… AMRO EURO -0.913 -1.44 -0.383 4.29e-5 ****
## 8 boxcoxCaseMor… AMRO SEARO -0.142 -1.30 1.02 9.98e-1 ns
## 9 boxcoxCaseMor… AMRO WPRO -1.08 -1.91 -0.248 6 e-3 **
## 10 boxcoxCaseMor… EMRO EURO -0.754 -1.62 0.113 1.17e-1 ns
## 11 boxcoxCaseMor… EMRO SEARO 0.0166 -1.26 1.29 1 e+0 ns
## 12 boxcoxCaseMor… EMRO WPRO -0.922 -1.97 0.127 1.13e-1 ns
## 13 boxcoxCaseMor… EURO SEARO 0.770 -0.397 1.94 2.92e-1 ns
## 14 boxcoxCaseMor… EURO WPRO -0.168 -1.02 0.685 9.9 e-1 ns
## 15 boxcoxCaseMor… SEARO WPRO -0.939 -2.21 0.331 2.24e-1 ns
Based upon my Games-Howell Test, there is a significant difference in the means of Europe and Africa, the Western Pacific and Africa, Europe and the Americas, and the West Pacific and the Americas. In particular, the 95% Confidence interval in the difference of the means of Europe and Africa is \([-1.3880957, -0.4114904]\), the 95% Confidence interval in the difference of the means of the Western Pacific and Africa is \([-1.8808796, -0.2556291]\), the 95% Confidence interval in the difference of the means of the Europe and the Americas is \([-1.4422163 -0.3829553]\) and, the 95% Confidence interval in the difference of the means of the West Pacific and the Americas is \([-1.9139549, -0.2481394]\).
This communicates that Africa has a significantly larger Mortality Rate than Europe, the Americas has a significantly larger mortality rate than Europe, Africa has a significantly larger Mortality Rate than the West Pacific and the Americas has a significantly larger mortality rate than the West Pacific. This is significant because, coupled with the linear model fitted from earlier, it seems very likely that more developed regions (such as Europe and the West Pacific), have markedly lower case mortality rates than regions that are less developed overall (such as the Americas as a whole, and Africa). With this knowledge, we should give/offer vaccinations to less developed parts of the world to prevent more deaths, and decrease their case mortality rates.
CONCLUSION
Overall, through exploring the data and using different models we were able to get a lot of insight into mechanisms of the covid pandemic, in regard to case mortality rate. Through fitting a linear regression model with BoxCox of Case Mortality Rate against HDI and Persons Fully Vaccinated (out of 100), we learned these two explanatory variables, and their interaction terms can explain around half the variance in the predictor variable. This communicated that how developed a country is, along with how many persons out of 100 are fully vaccinated, significantly changes the case mortality rate, as having a higher HDI and higher persons fully vaccinated out of 100 results in a lower case mortality rate.
The secondary model is an ANOVA model comparing each WHO’s region case mortality rate, as transformed by the same BoxCox in the linear regression model. This model communicated that both Europe and the West Pacific have a markedly lower case mortality rate than that both of the Americas and Africa. This ties into the first model as Europe and the West Pacific are generally higher developed regions, with more persons fully vaccinated overall than the Americas and Africa. With this knowledge, we should give excess vaccines to underdeveloped regions, such as Africa, do prevent deaths and eventually decrease their case mortality rate.
Some Caveats with this model is that the HDI is from 2020, and population that I used to normalize the Cumulative Deaths and Cases is from 2019, although these likely are not significant enough stressors to dampen my models accuracy, as the HDI of nations and their population likely were quite similar between 2020 and 2022 (they don’t change much). My cumulative cases and cumulative deaths also are from March 1st of 2021 to March 1st of 2022, as the former date I felt was a good predictor of when most countries started offering vaccinations for COVID-19, so this should only be limited to that time frame. This model also assumes integrity and honestly in reporting covid case and death numbers, along with the capability to do so in the first place. Further analysis of the relationship between the ability to report cases, and how developed a nation is would be good, as it would help us understand if underdeveloped nations have less of a capability to report cases and deaths. Finally, the persons fully vaccinated (out of 100) is the most up to date one for each nation, and most definitely changed over the course of a year, which decreased the case mortality rate as persons fully vaccinated out of 100 increased. However, a better, more thorough analysis would require better data, as in how persons fully vaccinated changed over time, as the model is likely very sensitive to it changing.
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] readxl_1.3.1 rgl_0.108.3 rstatix_0.7.0 broom_0.7.12
## [5] lmtest_0.9-39 zoo_1.8-9 car_3.0-12 carData_3.0-5
## [9] reshape2_1.4.4 rvest_1.0.2 countrycode_1.3.1 viridis_0.6.2
## [13] viridisLite_0.4.0 hrbrthemes_0.8.0 class_7.3-20 caret_6.0-90
## [17] lattice_0.20-45 MASS_7.3-54 glmnet_4.1-3 Matrix_1.3-4
## [21] ggpubr_0.4.0 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.8
## [25] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.6
## [29] tidyverse_1.3.1 plotly_4.10.0 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 selectr_0.4-2 ggsignif_0.6.3
## [4] ellipsis_0.3.2 fs_1.5.2 rstudioapi_0.13
## [7] farver_2.1.0 listenv_0.8.0 bit64_4.0.5
## [10] prodlim_2019.11.13 fansi_1.0.2 lubridate_1.8.0
## [13] xml2_1.3.3 codetools_0.2-18 splines_4.1.2
## [16] extrafont_0.17 knitr_1.37 jsonlite_1.7.3
## [19] pROC_1.18.0 Rttf2pt1_1.3.10 dbplyr_2.1.1
## [22] compiler_4.1.2 httr_1.4.2 backports_1.4.1
## [25] assertthat_0.2.1 fastmap_1.1.0 lazyeval_0.2.2
## [28] cli_3.1.1 htmltools_0.5.2 tools_4.1.2
## [31] gtable_0.3.0 glue_1.6.1 Rcpp_1.0.8
## [34] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.3.8
## [37] nlme_3.1-153 extrafontdb_1.0 crosstalk_1.2.0
## [40] iterators_1.0.14 timeDate_3043.102 gower_1.0.0
## [43] xfun_0.29 globals_0.14.0 lifecycle_1.0.1
## [46] future_1.23.0 scales_1.1.1 ipred_0.9-12
## [49] vroom_1.5.7 hms_1.1.1 parallel_4.1.2
## [52] curl_4.3.2 yaml_2.2.2 gridExtra_2.3
## [55] gdtools_0.2.4 rpart_4.1-15 stringi_1.7.6
## [58] highr_0.9 foreach_1.5.2 lava_1.6.10
## [61] shape_1.4.6 rlang_1.0.1 pkgconfig_2.0.3
## [64] systemfonts_1.0.4 evaluate_0.14 labeling_0.4.2
## [67] recipes_0.1.17 htmlwidgets_1.5.4 bit_4.0.4
## [70] tidyselect_1.1.1 parallelly_1.30.0 plyr_1.8.6
## [73] magrittr_2.0.2 R6_2.5.1 generics_0.1.2
## [76] DBI_1.1.2 pillar_1.7.0 haven_2.4.3
## [79] withr_2.4.3 survival_3.2-13 abind_1.4-5
## [82] nnet_7.3-16 future.apply_1.8.1 modelr_0.1.8
## [85] crayon_1.4.2 utf8_1.2.2 tzdb_0.2.0
## [88] rmarkdown_2.11 grid_4.1.2 data.table_1.14.2
## [91] ModelMetrics_1.2.2.2 reprex_2.0.1 digest_0.6.29
## [94] stats4_4.1.2 munsell_0.5.0